########################################################
# Project:    Commission Communication
# Task:       Identify words that are specific to the
#             Commission, but rare / hard to understand
#             for others
# Author:     Christian Rauh (15.09.2021)
########################################################


# Packages ####
library(tidyverse) # 1.3.0
library(quanteda) # 3.2.0
library(sophistication) # 0.70 - https://github.com/kbenoit/sophistication

# Other tools ####

# Edited functions from sophistication package
# which excludes stopwords from calculating term familiarity
source("./Tools/covars_make_baselines_CR.R")

# "Not in" operator: element-wise negation of `%in%`
`%ni%` <- function(x, table) {
  !(x %in% table)
}



# Diverging word usage ####

# Commission / national-executive overlap corpus (as used in the text matching).
# `commission` flags the documents whose sender is the Commission.
raw <- read_rds("./Corpora/Overlap-EC-UK-IRE.rds")
raw <- raw %>%
  mutate(commission = sender == "Commission")

# Document-feature matrix of absolute counts: lower-cased, unstemmed,
# hyphenated compounds split; punctuation, numbers, symbols and English
# stopwords dropped.
# NOTE(review): handing a corpus plus `stem` / `remove` / `remove_*` arguments
# to dfm() is deprecated as of quanteda 3 - consider an explicit tokens() step.
corp <- corpus(raw$text, docvars = raw[, c("sender", "commission")])
mat <- dfm(
  corp,
  tolower = TRUE,
  stem = FALSE,
  split_hyphens = TRUE,
  remove_punct = TRUE,
  remove_numbers = TRUE,
  remove_symbols = TRUE,
  remove = stopwords("english"),
  verbose = TRUE
)

# Same matrix with within-document relative term frequencies
wmat <- dfm_weight(mat, scheme = "prop")

# Word usage of the Commission versus national executives:
# one row per word, relative frequency within each sender group as columns
typefreq <- mat %>%
  dfm_group(groups = docvars(wmat)$commission) %>% # pool documents by the Commission flag
  dfm_weight(scheme = "prop") %>%                  # relative frequencies within each group
  t() %>%                                          # words into rows, groups into columns
  convert(to = "data.frame") %>%
  rename(
    word = doc_id, # after transposing, the doc_id column holds the words
    nat = `FALSE`,
    com = `TRUE`
  ) %>%
  mutate(
    diff = com - nat,             # Commission minus national executives
    uniqc = nat == 0 & com != 0,  # used by the Commission only
    uniqn = nat != 0 & com == 0   # used by national executives only
  ) %>%
  arrange(desc(diff))

# Words that occur in one group only, most frequent first
uniqcom <- typefreq %>%
  filter(uniqc) %>%
  arrange(desc(com)) %>%
  transmute(
    word,
    relfreq = com,
    label = "Freq. used by the Commission\nbut never by nat. governments\nTop-30"
  )
uniqnat <- typefreq %>%
  filter(uniqn) %>%
  arrange(desc(nat)) %>%
  transmute(
    word,
    relfreq = nat,
    label = "Freq. used by nat. govs.,\nbut never by the Commission\nTop-30"
  )

# The 30 most frequent exclusive words of each group, stacked
topuniqwords <- rbind(
  head(uniqcom, 30),
  head(uniqnat, 30)
)

# One bar-chart facet per group, words ordered by their relative frequency
ggplot(topuniqwords, aes(x = relfreq, y = reorder(word, relfreq))) +
  geom_col() +
  facet_wrap(. ~ label, scales = "free") +
  labs(
    title = "Unique Words in Commission and national press releases",
    x = "Relative word frequencies in texts of respective groups",
    y = ""
  ) +
  theme_bw() +
  theme(
    strip.text = element_text(face = "bold"),
    axis.text.y = element_text(color = "black")
  )

ggsave("./Plots/UniqueWords-COMvsNAT.png", width = 16, height = 12, units = "cm")


# Differences in relative frequency among words that BOTH groups use
typefreq2 <- typefreq %>%
  filter(!uniqn & !uniqc) %>% # drop words that are exclusive to either group
  arrange(desc(diff))

# Number of words taken from each end of the ranking. Capped at the number of
# available words so that the `run` sequence below always matches the row
# count (a fixed 2 x 30 sequence fails when fewer than 30 shared words exist).
n_top <- min(30L, nrow(typefreq2))

strongdiff <- rbind(
  head(typefreq2, n_top), # largest positive diff (Commission overweight)
  tail(typefreq2, n_top)  # largest negative diff (national overweight)
) %>%
  arrange(desc(diff)) %>%
  mutate(run = rep(seq_len(n_top), 2)) # vertical plot position only, no substantive meaning

# Words placed at their frequency difference on the x-axis
ggplot(strongdiff, aes(x = diff, y = run)) +
  geom_vline(xintercept = 0, linetype = "dashed") +
  geom_text(aes(label = word)) +
  labs(
    title = "Which words do nat. governments and the Commission use in different frequencies?",
    subtitle = "The Top-30 with the strongest differences in frequency, respectively. Only words used by both groups considered here.",
    x = "Difference in relative word frequency in press releases\nby the Commission and UK/IRE governments",
    y = ""
  ) +
  theme_bw() +
  theme(
    axis.text.y = element_blank(),
    panel.grid.major.y = element_blank(),
    panel.grid.minor.y = element_blank(),
    axis.ticks.y = element_blank()
  )

ggsave("./Plots/FigureA2_DivisiveWords-COMvsNAT.png", width = 22, height = 12, units = "cm")


# Keyness - Comm vs nat press releases ####
# Keyness statistics are association tests on a target-vs-reference
# contingency table, so they are computed on the absolute counts in `mat`
# rather than on the proportion-weighted `wmat`.
# Target group: documents flagged as sent by the Commission.
CommNat_key <- textstat_keyness(mat, target = mat$commission)

# Show the 30 strongest key terms of the target group only
textplot_keyness(
  CommNat_key,
  show_reference = FALSE,
  n = 30
)



# Keyness - Comm vs Google books ####

# Google 1-gram counts as shipped with the sophistication package
# (per the original notes: features in row names, absolute frequencies in
# cells, one column per decade)
ngrams <- data_matrix_google1grams

# Restrict to the decades covered by the Commission press releases.
# Author's assumption: a column holds the books published in that decade, so
# the absolute counts are summed across the selected decades and relative
# frequencies are then taken over the pooled word count.
grelfreq <- ngrams[, c("1980", "1990", "2000")] %>%
  rowSums() %>%
  data.frame() %>%
  rename(g.freq = 1) %>%
  rownames_to_column(var = "feature") %>%
  mutate(g.relfreq = g.freq / sum(g.freq)) # relative term frequencies

# The Commission press release corpus, treated as one big chunk of text.
# pull() extracts the text column as a character vector before collapsing;
# paste() applied to the data frame itself would deparse the column into a
# single 'c("...", "...")' string with escaped quotes and line breaks.
com <- read_rds("./Corpora/EC-PressReleases_1985-2020_clean.RDS") %>%
  pull(text) %>%
  paste(collapse = " ") %>%
  str_remove_all("'s") # possessive s, not in the n-grams data (author's note)

ccorp <- corpus(com) # quanteda corpus object (a single document)
ctok <- tokens(
  ccorp,
  remove_symbols = TRUE,
  remove_numbers = TRUE,
  remove_url = TRUE,
  remove_separators = TRUE,
  remove_punct = TRUE,
  split_hyphens = TRUE
)
cmat <- dfm(ctok)                          # DFM, absolute frequencies
cwmat <- dfm_weight(cmat, scheme = "prop") # DFM, relative frequencies

# Reshape both one-document matrices into feature-per-row data frames
cmat <- convert(cmat, to = "matrix") %>%
  t() %>%
  data.frame() %>%
  rename(c.freq = 1) %>%
  rownames_to_column(var = "feature")

cwmat <- convert(cwmat, to = "matrix") %>%
  t() %>%
  data.frame() %>%
  rename(c.relfreq = 1) %>%
  rownames_to_column(var = "feature")

# Absolute and relative Commission frequencies side by side
comrelfreq <- cmat %>%
  left_join(cwmat, by = "feature")


# Join frequencies
# Note: Availability in the Google Books data determines whether a feature is
# covered (left join on the n-grams table)!
# Com vocabulary notably larger - check how selection into the ngrams
# collection works (on part of google OR quanteda)
freqs <- grelfreq %>%
  left_join(comrelfreq, by = "feature")

# Features missing from the Commission corpus have a true relative frequency
# of zero
# NOTE(review): c.freq is left as NA for these features - confirm whether it
# should be zero-filled as well
freqs$c.relfreq[is.na(freqs$c.relfreq)] <- 0

# Commission overweight (absolute difference of relative frequencies)
freqs$diff <- freqs$c.relfreq - freqs$g.relfreq

# Filter a bit: drop English stopwords
freqs <- freqs %>%
  filter(feature %ni% quanteda::stopwords("english")) %>%
  arrange(desc(diff))
# Further candidates noted by the author: country names, month names

# Commission overweight as a ratio of relative frequencies
freqs$diff2 <- freqs$c.relfreq / freqs$g.relfreq

# Frequency ranks in each source (1 = most frequent); row_number() also works
# for an empty table, where 1:nrow() would yield c(1, 0)
freqs <- freqs %>%
  arrange(desc(g.relfreq)) %>%
  mutate(g.rank = row_number())
freqs <- freqs %>%
  arrange(desc(c.relfreq)) %>%
  mutate(c.rank = row_number())

# Positive values: ranked higher in the Commission corpus than in Google Books
freqs$rank.diff <- freqs$g.rank - freqs$c.rank

plot(freqs$rank.diff, freqs$diff)

write_rds(freqs, "./Data/WordFreqs_ComVsGoogle.rds")
# write.csv2() already uses ";" as separator; passing `sep` is ignored with a
# warning, so it is omitted here
write.csv2(freqs, "./Data/WordFreqs_ComVsGoogle.csv", row.names = FALSE)


# Least familiar words per Comm press release ####

# Get Google books scores for all words in press release
# Keep lowest three, unlist and count aggregates





# Commission versus press #####

# Newspaper texts (BNC)

# Clean version of the BNC corpus: keep news texts from broadsheets and
# tabloids and label each text with its newspaper type
bnc <- read_rds("./Corpora/BNC_RawTexts.rds") %>%
  filter(
    type == "NEWS",
    str_detect(category, "brdsht|tabloid")
  ) %>%
  mutate(
    newspaper = ifelse(str_detect(category, "brdsht"), "Broadsheet", "Tabloid")
  )

# Smaller chunks
# BNC contains samples of newspapers chunked in big blocks.
# Separate them into their original paragraphs here: every text is split at
# line breaks and each piece becomes its own row, carrying the newspaper type
# of the text it came from.
# Done in one vectorised step instead of growing a data frame with rbind()
# inside a loop, which also keeps an empty `bnc` from raising an error.
press <- bnc %>%
  transmute(
    text = str_split(text, "\n"), # list column: one character vector per BNC text
    newspaper
  ) %>%
  unnest(cols = text) %>%         # one row per paragraph
  as.data.frame()

rm(bnc)

# Clean up a little
press <- press %>%
  mutate(
    # Insert the whitespace missing after punctuation between a lower-case
    # and an upper-case letter
    text = str_replace_all(text, "([a-z])([[:punct:]])([A-Z])", "\\1\\2 \\3"),
    newspaper = NULL,        # newspaper type not needed any further
    sender = "Press (BNC)"   # common sender label for all press paragraphs
  )


# Commission press releases: text plus a constant sender label
com <- read_rds("./Corpora/EC-PressReleases_1985-2020_clean.RDS") %>%
  transmute(text, sender = "Commission")

# Joint corpus of press releases and press paragraphs, with Commission flag
compress <- rbind(com, press)
compress <- compress %>%
  mutate(commission = sender == "Commission")

# Document-feature matrix of absolute counts: lower-cased, unstemmed,
# hyphenated compounds split; punctuation, numbers, symbols and English
# stopwords dropped.
# NOTE(review): handing a corpus plus `stem` / `remove` / `remove_*` arguments
# to dfm() is deprecated as of quanteda 3 - consider an explicit tokens() step.
corp <- corpus(compress$text, docvars = compress[, c("sender", "commission")])
mat <- dfm(
  corp,
  tolower = TRUE,
  stem = FALSE,
  split_hyphens = TRUE,
  remove_punct = TRUE,
  remove_numbers = TRUE,
  remove_symbols = TRUE,
  remove = stopwords("english"),
  verbose = TRUE
)

# Same matrix with within-document relative term frequencies
wmat <- dfm_weight(mat, scheme = "prop")


# Word usage of the Commission versus the press (BNC):
# one row per word, relative frequency within each sender group as columns
typefreq <- mat %>%
  dfm_group(groups = docvars(wmat)$commission) %>% # pool documents by the Commission flag
  dfm_weight(scheme = "prop") %>%                  # relative frequencies within each group
  t() %>%                                          # words into rows, groups into columns
  convert(to = "data.frame") %>%
  rename(
    word = doc_id, # after transposing, the doc_id column holds the words
    press = `FALSE`,
    com = `TRUE`
  ) %>%
  mutate(
    diff = com - press,             # Commission minus press
    uniqc = press == 0 & com != 0,  # used by the Commission only
    uniqp = press != 0 & com == 0   # used by the press only
  ) %>%
  arrange(desc(diff))

# Words that occur in one group only, most frequent first
uniqcom <- typefreq %>%
  filter(uniqc) %>%
  arrange(desc(com)) %>%
  transmute(
    word,
    relfreq = com,
    label = "Freq. used by the Commission\nbut never by the press\nTop-30"
  )
uniqpress <- typefreq %>%
  filter(uniqp) %>%
  arrange(desc(press)) %>%
  transmute(
    word,
    relfreq = press,
    label = "Freq. used by the press,\nbut never by the Commission\nTop-30"
  )

# The 30 most frequent exclusive words of each group, stacked
topuniqwords <- rbind(
  head(uniqcom, 30),
  head(uniqpress, 30)
)

# One bar-chart facet per group, words ordered by their relative frequency
ggplot(topuniqwords, aes(x = relfreq, y = reorder(word, relfreq))) +
  geom_col() +
  facet_wrap(. ~ label, scales = "free") +
  labs(
    title = "Unique Words in Commission press release and national press articles",
    x = "Relative word frequencies in texts of respective groups",
    y = ""
  ) +
  theme_bw() +
  theme(
    strip.text = element_text(face = "bold"),
    axis.text.y = element_text(color = "black")
  )

# Export deliberately disabled for the BNC comparison
# ggsave("./Plots/UniqueWords-COMvsPRESS.png", width = 16, height = 12, units = "cm")

# Pretty stupid, the 1994 effect ...




# Comm vs Press, Guardian 2018-2020 ####


# Clean version of the GUA corpus: stack the three yearly files and drop
# exact duplicate rows
gua <- unique(
  do.call(
    rbind,
    lapply(
      c(
        "./Corpora/GUA_2018.rds",
        "./Corpora/GUA_2019.rds",
        "./Corpora/GUA_2020.rds"
      ),
      read_rds
    )
  )
)

# One text per article (headline and body joined by ". ") plus sender label
press <- gua %>%
  transmute(
    text = paste(fields.headline, fields.bodyText, sep = ". "),
    sender = "Guardian"
  )

# Free the memory held by the raw Guardian data
rm(gua)
gc()


# Commission press releases from 2018 through 2020, with sender label
com <- read_rds("./Corpora/EC-PressReleases_1985-2020_clean.RDS") %>%
  filter(year >= 2018, year < 2021) %>%
  transmute(text, sender = "Commission")

# Joint corpus of press releases and Guardian articles, with Commission flag
compress <- rbind(com, press)
compress <- compress %>%
  mutate(commission = sender == "Commission")


# Document-feature matrix of absolute counts: lower-cased, unstemmed,
# hyphenated compounds split; punctuation, numbers, symbols and English
# stopwords dropped.
# NOTE(review): handing a corpus plus `stem` / `remove` / `remove_*` arguments
# to dfm() is deprecated as of quanteda 3 - consider an explicit tokens() step.
corp <- corpus(compress$text, docvars = compress[, c("sender", "commission")])
mat <- dfm(
  corp,
  tolower = TRUE,
  stem = FALSE,
  split_hyphens = TRUE,
  remove_punct = TRUE,
  remove_numbers = TRUE,
  remove_symbols = TRUE,
  remove = stopwords("english"),
  verbose = TRUE
)

# Same matrix with within-document relative term frequencies
wmat <- dfm_weight(mat, scheme = "prop")
# Alternative weighting tried by the author:
# wmat <- dfm_tfidf(mat)

# Word usage of the Commission versus the Guardian:
# one row per word, tf-idf weight within each sender group as columns
# (the proportional weighting used in the other comparisons is switched off)
typefreq <- mat %>%
  dfm_group(groups = docvars(wmat)$commission) %>% # pool documents by the Commission flag
  # dfm_weight(scheme = "prop") %>%
  dfm_tfidf() %>%                                  # tf-idf across the two pooled groups
  t() %>%                                          # words into rows, groups into columns
  convert(to = "data.frame") %>%
  rename(
    word = doc_id, # after transposing, the doc_id column holds the words
    press = `FALSE`,
    com = `TRUE`
  ) %>%
  mutate(
    diff = com - press,             # Commission minus Guardian (tf-idf weights)
    uniqc = press == 0 & com != 0,  # non-zero weight for the Commission only
    uniqp = press != 0 & com == 0   # non-zero weight for the Guardian only
  ) %>%
  arrange(desc(diff))

# Words with a non-zero weight in one group only, highest weight first.
# The `com` / `press` columns of `typefreq` hold tf-idf weights here, not
# relative frequencies; the column name `relfreq` is kept for the plot code.
uniqcom <- typefreq %>%
  filter(uniqc) %>%
  arrange(desc(com)) %>%
  rename(relfreq = com) %>%
  mutate(label = "Freq. used by the Commission\nbut never in the Guardian\nTop-30") %>%
  select(word, relfreq, label)
uniqpress <- typefreq %>%
  filter(uniqp) %>%
  arrange(desc(press)) %>%
  rename(relfreq = press) %>%
  mutate(label = "Freq. used in the Guardian,\nbut never by the Commission\nTop-30") %>%
  select(word, relfreq, label)

# The 30 highest-weighted exclusive words of each group, stacked
topuniqwords <- rbind(
  head(uniqcom, 30),
  head(uniqpress, 30)
)

# One bar-chart facet per group; the x-axis label names the tf-idf weighting
# that is actually plotted
ggplot(topuniqwords, aes(x = relfreq, y = reorder(word, relfreq))) +
  geom_col() +
  facet_wrap(. ~ label, scales = "free") +
  labs(
    title = "Unique Words in Commission press release and national press articles",
    x = "Tf-idf weighted word frequencies in texts of respective groups",
    y = ""
  ) +
  theme_bw() +
  theme(
    strip.text = element_text(face = "bold"),
    axis.text.y = element_text(color = "black")
  )

ggsave("./Plots/UniqueWords-COMvsPRESS.png", width = 16, height = 12, units = "cm")
